The goal of this working group is to benchmark recent covariate-adjusted FDR methods. For complete details, check out our benchmark-fdr GitHub repo.
In this R Markdown, we use a Monte Carlo simulation study to evaluate the All nulls simulation (all tests are simulated from the null distribution, i.e., no difference). We define the methods and metrics to evaluate below.
Methods to evaluate:
Metrics to evaluate:
Load libraries:
library(dplyr)
library(ggplot2)
library(cowplot)
library(doParallel)

Additional packages that should be installed are:
library(genefilter)
library(IHW)
library(ashr)
library(qvalue)
library(swfdr)
library(fdrtool)
## to use IHWpaper::scott_fdrreg wrapper, need to install FDRreg v2.0 (only available github):
## devtools::install_github(repo= 'jgscott/FDRreg', subdir='R_pkg/')
library(FDRreg)

Descriptive summary of simulation study:
du_ttest_sim():
Generate example data set with an uninformative covariate and informative covariate:
# Generate one example draw of the all-null simulation (pi0 = 1, effect_size = 0)
# so the diagnostic plots below have data to work with.
# NOTE(review): `m` was previously used here without being defined until the
# odyssey chunk further down; it is defined here so this chunk is self-contained.
m <- 20000        # number of tests
n_samples <- 20   # samples per group
n_groups <- 2     # number of groups compared by the t-test

# uninformative covariate = uniform(0,1)
sim_df_uninform <- du_ttest_sim(m = m, pi0 = 1, effect_size = 0,
                                n_samples = n_samples, n_groups = n_groups,
                                informative_ind_covariate = FALSE)
# informative covariate = pooled variance
sim_df_inform <- du_ttest_sim(m = m, pi0 = 1, effect_size = 0,
                              n_samples = n_samples, n_groups = n_groups,
                              informative_ind_covariate = TRUE)
head(sim_df_inform)

## H test_statistic effect_size pval ind_covariate SE
## 1 0 1.03415103 0.46307310 0.31476063 1.0030973 0.4477809
## 2 0 1.82432991 0.79961352 0.08474789 1.0383932 0.4383053
## 3 0 -1.66779670 -0.72815880 0.11265886 1.0210103 0.4365993
## 4 0 1.36144510 0.65679004 0.19017299 1.1026902 0.4824212
## 5 0 0.05598207 0.02487557 0.95597282 0.9671779 0.4443489
## 6 0 0.31964617 0.11375457 0.75291677 0.7767350 0.3558765
Inside the simulated “null” dataset object:
H is an indicator representing the true null (0) or alternative status (1).
test_statistic is the t-test statistic.
effect_size is the difference in means.
pval is the \(p\)-value from the t-test.
ind_covariate is the independent covariate (can be informative or uninformative). Here it's informative.
SE is the standard error (\(\frac{\sigma}{\sqrt{n}}\)).

Need to check two things:
# Diagnostic plots. The helpers expect the p-value column to be named "pvals",
# so rename it first in both simulated data sets.
colnames(sim_df_uninform)[which(colnames(sim_df_uninform) == "pval")] <- "pvals"
colnames(sim_df_inform)[which(colnames(sim_df_inform) == "pval")] <- "pvals"

# (1) p-value histograms stratified by the covariate
strat_hist(sim_df_uninform, pval = "pvals", covariate = "ind_covariate", maxy = 2)
strat_hist(sim_df_inform, pval = "pvals", covariate = "ind_covariate", maxy = 2)
This does not mean the covariate is informative. We use the second diagnostic plot to check this.
# (2) scatter of p-value rank vs covariate — checks informativeness
rank_scatter(sim_df_uninform, pval = "pvals", covariate = "ind_covariate", bins = 100)
rank_scatter(sim_df_inform, pval = "pvals", covariate = "ind_covariate", bins = 100)
Run following on odyssey and save output as RDS:
library(dplyr)
library(ggplot2)
library(doParallel)

workingPath <- "/net/irizarryfs01/srv/export/irizarryfs01_backed_up/share_root/shicks/projects/benchmark-fdr"
source(file.path(workingPath, "simulation-helpers.R"))
source(file.path(workingPath, "evaluation-helpers.R"))

# Simulation settings (set these to a smaller number temporarily when testing)
m <- 20000  # number of tests
M <- 500    # number of MC reps
n_samples <- 20
n_groups <- 2
alphas <- seq(0.01, 0.1, length.out = 10)

# Parallel backend
nCores <- 10
registerDoParallel(cores = nCores)
workers <- getDoParWorkers()
backend <- getDoParName()
version <- getDoParVersion()

# Uninformative covariate (uniform[0,1]); all nulls (pi0 = 1, effect_size = 0).
# Each replication simulates a fresh data set and runs all methods via sim_runner().
df.out.uninform <- foreach(i = seq_len(M), .verbose = TRUE) %dopar% {
    sim_df_uninform <- du_ttest_sim(m = m, pi0 = 1, effect_size = 0,
                                    n_samples = n_samples, n_groups = n_groups,
                                    informative_ind_covariate = FALSE)
    sim_runner(sim = sim_df_uninform, alphas = alphas, pvals = FALSE)
}
df.out.uninform <- dplyr::bind_rows(df.out.uninform)
saveRDS(df.out.uninform, file = file.path(workingPath, "Results-nullSims-uninform.RDS"))

# Informative covariate (pooled variance); all nulls (pi0 = 1, effect_size = 0)
df.out.inform <- foreach(i = seq_len(M), .verbose = TRUE) %dopar% {
    sim_df_inform <- du_ttest_sim(m = m, pi0 = 1, effect_size = 0,
                                  n_samples = n_samples, n_groups = n_groups,
                                  informative_ind_covariate = TRUE)
    sim_runner(sim = sim_df_inform, alphas = alphas, pvals = FALSE)
}
df.out.inform <- dplyr::bind_rows(df.out.inform)
saveRDS(df.out.inform, file = file.path(workingPath, "Results-nullSims-inform.RDS"))
Here we will plot the number of discoveries as a function of \(\alpha\) (should be 0 false discoveries). We will also evaluate methods control for false discovery rate (FDR) and family-wise error rate (FWER) at nominal \(\alpha\) level ranging from [0.01, 0.1].
Load and plot results from RDS file:
# workingPath <- "/net/irizarryfs01/srv/export/irizarryfs01_backed_up/share_root/shicks/projects/benchmark-fdr"
# df.out.uninform <- readRDS(file=file.path(workingPath, "Results-nullSims-uninform.RDS"))
df.out.uninform <- readRDS(file = file.path(RPROJ$PROJHOME, "R/Results-nullSims-uninform.RDS"))

# Average metrics across MC replications once, then reuse for every plot
# (previously the same group_by/summarize pipeline was recomputed three times)
summ.uninform <- df.out.uninform %>%
    dplyr::group_by(method, alpha) %>%
    dplyr::summarize(n_rejects = mean(n_rejects), FDR = mean(FDP),
                     FWER = mean(FWER), FPR = mean(FPR))

# Number of discoveries as a function of $\alpha$
summ.uninform %>%
    ggplot(aes(x = alpha, y = n_rejects, color = method)) +
    geom_line() + geom_point() + geom_abline(linetype = "dashed") +
    xlab(expression(bold(paste("Nominal ", alpha)))) + ylab("Number of Discoveries") +
    theme(axis.title = element_text(face = "bold"))

# False discovery rate as a function of $\alpha$ (dashed y = x line = nominal level)
summ.uninform %>%
    ggplot(aes(x = alpha, y = FDR, color = method)) +
    geom_line() + geom_point() + geom_abline(linetype = "dashed") +
    xlab(expression(bold(paste("Nominal ", alpha)))) + ylab("FDR") +
    theme(axis.title = element_text(face = "bold"))

# Family-wise error rate as a function of $\alpha$
summ.uninform %>%
    ggplot(aes(x = alpha, y = FWER, color = method)) +
    geom_line() + geom_point() + geom_abline(linetype = "dashed") +
    xlab(expression(bold(paste("Nominal ", alpha)))) + ylab("FWER") +
    theme(axis.title = element_text(face = "bold"))
Load and plot results from RDS file:
# workingPath <- "/net/irizarryfs01/srv/export/irizarryfs01_backed_up/share_root/shicks/projects/benchmark-fdr"
# df.out.inform <- readRDS(file=file.path(workingPath, "Results-nullSims-inform.RDS"))
df.out.inform <- readRDS(file = file.path(RPROJ$PROJHOME, "R/Results-nullSims-inform.RDS"))

# Average metrics across MC replications once, then reuse for every plot
# (previously the same group_by/summarize pipeline was recomputed three times)
summ.inform <- df.out.inform %>%
    dplyr::group_by(method, alpha) %>%
    dplyr::summarize(n_rejects = mean(n_rejects), FDR = mean(FDP),
                     FWER = mean(FWER), FPR = mean(FPR))

# Number of discoveries as a function of $\alpha$
summ.inform %>%
    ggplot(aes(x = alpha, y = n_rejects, color = method)) +
    geom_line() + geom_point() + geom_abline(linetype = "dashed") +
    xlab(expression(bold(paste("Nominal ", alpha)))) + ylab("Number of Discoveries") +
    theme(axis.title = element_text(face = "bold"))

# False discovery rate as a function of $\alpha$ (dashed y = x line = nominal level)
summ.inform %>%
    ggplot(aes(x = alpha, y = FDR, color = method)) +
    geom_line() + geom_point() + geom_abline(linetype = "dashed") +
    xlab(expression(bold(paste("Nominal ", alpha)))) + ylab("FDR") +
    theme(axis.title = element_text(face = "bold"))

# Family-wise error rate as a function of $\alpha$
summ.inform %>%
    ggplot(aes(x = alpha, y = FWER, color = method)) +
    geom_line() + geom_point() + geom_abline(linetype = "dashed") +
    xlab(expression(bold(paste("Nominal ", alpha)))) + ylab("FWER") +
    theme(axis.title = element_text(face = "bold"))
Descriptive summary of simulation study:
du_ttest_sim():
Generate example data set with an uninformative covariate and informative covariate:
# Generate one example draw of the varying-pi0 simulation
# (30% alternatives, effect_size = 1.5) for the diagnostic plots below.
# NOTE(review): `m` was previously used here without being defined until the
# odyssey chunk further down; it is defined here so this chunk is self-contained.
m <- 20000        # number of tests
n_samples <- 20   # samples per group
n_groups <- 2     # number of groups compared by the t-test

# uninformative covariate = uniform(0,1)
sim_df_uninform <- du_ttest_sim(m = m, pi0 = 0.7, effect_size = 1.5,
                                n_samples = n_samples, n_groups = n_groups,
                                informative_ind_covariate = FALSE)
# informative covariate = pooled variance
sim_df_inform <- du_ttest_sim(m = m, pi0 = 0.7, effect_size = 1.5,
                              n_samples = n_samples, n_groups = n_groups,
                              informative_ind_covariate = TRUE)
head(sim_df_inform)

## H test_statistic effect_size pval ind_covariate SE
## 1 1 -7.7836115 -2.50933829 3.613498e-07 1.4660710 0.3223874
## 2 1 -2.4353030 -1.28778743 2.550553e-02 1.3270188 0.5287997
## 3 0 -0.1087734 -0.04055024 9.145856e-01 0.8116297 0.3727956
## 4 0 -1.5789319 -0.63478587 1.317638e-01 0.9336307 0.4020350
## 5 1 -2.1718859 -1.00436230 4.347181e-02 1.1306754 0.4624379
## 6 0 -0.4710017 -0.17630037 6.432973e-01 0.8196625 0.3743094
Need to check two things:
# Diagnostic plots. The helpers expect the p-value column to be named "pvals",
# so rename it first in both simulated data sets.
colnames(sim_df_uninform)[which(colnames(sim_df_uninform) == "pval")] <- "pvals"
colnames(sim_df_inform)[which(colnames(sim_df_inform) == "pval")] <- "pvals"

# (1) p-value histograms stratified by the covariate
strat_hist(sim_df_uninform, pval = "pvals", covariate = "ind_covariate", maxy = 2)
strat_hist(sim_df_inform, pval = "pvals", covariate = "ind_covariate", maxy = 2)
This does not mean the covariate is informative. We use the second diagnostic plot to check this.
# (2) scatter of p-value rank vs covariate — checks informativeness
rank_scatter(sim_df_uninform, pval = "pvals", covariate = "ind_covariate", bins = 100)
rank_scatter(sim_df_inform, pval = "pvals", covariate = "ind_covariate", bins = 100)
Run following on odyssey and save output as RDS:
library(dplyr)
library(ggplot2)
library(doParallel)

workingPath <- "/net/irizarryfs01/srv/export/irizarryfs01_backed_up/share_root/shicks/projects/benchmark-fdr"
source(file.path(workingPath, "simulation-helpers.R"))
source(file.path(workingPath, "evaluation-helpers.R"))

# Simulation settings (set these to a smaller number temporarily when testing)
m <- 20000  # number of tests
M <- 500    # number of MC reps
n_samples <- 20
n_groups <- 2
alphas <- seq(0.01, 0.1, length.out = 10)
pi0set <- c(0, 0.1, 0.3, 0.7, 0.9, 1)  # null proportions to sweep over

# Parallel backend
nCores <- 10
registerDoParallel(cores = nCores)
workers <- getDoParWorkers()
backend <- getDoParName()
version <- getDoParVersion()

# Uninformative covariate (uniform[0,1]).
# Each replication sweeps all pi0 values; the inner index is `j` so it does not
# shadow the foreach index `i` (previously both were named `i`), and results are
# collected with lapply/bind_rows instead of growing a data.frame with rbind.
df.out.uninform <- foreach(i = seq_len(M), .verbose = TRUE) %dopar% {
    dat <- lapply(seq_along(pi0set), function(j) {
        sim_df_uninform <- du_ttest_sim(m = m, pi0 = pi0set[j], effect_size = 1.5,
                                        n_samples = n_samples, n_groups = n_groups,
                                        informative_ind_covariate = FALSE)
        data.frame(sim_runner(sim = sim_df_uninform, alphas = alphas, pvals = FALSE),
                   pi0 = pi0set[j])
    })
    dplyr::bind_rows(dat)
}
df.out.uninform <- dplyr::bind_rows(df.out.uninform)
saveRDS(df.out.uninform, file = file.path(workingPath, "Results-varyingpi0Sims-uninform.RDS"))

# Informative covariate (pooled variance)
df.out.inform <- foreach(i = seq_len(M), .verbose = TRUE) %dopar% {
    dat <- lapply(seq_along(pi0set), function(j) {
        sim_df_inform <- du_ttest_sim(m = m, pi0 = pi0set[j], effect_size = 1.5,
                                      n_samples = n_samples, n_groups = n_groups,
                                      informative_ind_covariate = TRUE)
        data.frame(sim_runner(sim = sim_df_inform, alphas = alphas, pvals = FALSE),
                   pi0 = pi0set[j])
    })
    dplyr::bind_rows(dat)
}
df.out.inform <- dplyr::bind_rows(df.out.inform)
saveRDS(df.out.inform, file = file.path(workingPath, "Results-varyingpi0Sims-inform.RDS"))
Here we will plot the number of discoveries as a function of \(\alpha\). We will also evaluate methods control for false discovery rate (FDR), family-wise error rate (FWER) and Power at nominal \(\alpha\) level ranging from [0.01, 0.1].
Load and plot results from RDS file:
# workingPath <- "/net/irizarryfs01/srv/export/irizarryfs01_backed_up/share_root/shicks/projects/benchmark-fdr"
# df.out.uninform <- readRDS(file=file.path(workingPath, "Results-varyingpi0Sims-uninform.RDS"))
# NOTE: the simulation saves this file with a ".RDS" extension (see the odyssey
# chunk above); the path previously omitted it, so readRDS would fail.
df.out.uninform <- readRDS(file = file.path(RPROJ$PROJHOME, "R/Results-varyingpi0Sims-uninform.RDS"))

# Average metrics across MC replications once, then reuse for every plot
# (previously the same group_by/summarize pipeline was recomputed per plot)
summ.uninform <- df.out.uninform %>%
    dplyr::group_by(method, alpha, pi0) %>%
    dplyr::summarize(n_rejects = mean(n_rejects), FDR = mean(FDP),
                     FWER = mean(FWER), FPR = mean(FPR),
                     Power = mean(power))

# Number of discoveries as a function of $\alpha$
summ.uninform %>%
    ggplot(aes(x = alpha, y = n_rejects, color = method)) +
    geom_line() + geom_abline(linetype = "dashed") +
    xlab(expression(bold(paste("Nominal ", alpha)))) + ylab("Number of Discoveries") +
    theme(axis.title = element_text(face = "bold")) +
    facet_grid(~ pi0)

# False discovery rate as a function of $\alpha$ (dashed y = x line = nominal level)
summ.uninform %>%
    ggplot(aes(x = alpha, y = FDR, color = method)) +
    geom_line() + geom_abline(linetype = "dashed") +
    xlab(expression(bold(paste("Nominal ", alpha)))) + ylab("FDR") +
    theme(axis.title = element_text(face = "bold")) +
    facet_grid(~ pi0)

# Family-wise error rate as a function of $\alpha$
summ.uninform %>%
    ggplot(aes(x = alpha, y = FWER, color = method)) +
    geom_line() + geom_abline(linetype = "dashed") +
    xlab(expression(bold(paste("Nominal ", alpha)))) + ylab("FWER") +
    theme(axis.title = element_text(face = "bold")) +
    facet_grid(~ pi0)

# Power as a function of $\alpha$
summ.uninform %>%
    ggplot(aes(x = alpha, y = Power, color = method)) +
    geom_line() +
    xlab(expression(bold(paste("Nominal ", alpha)))) + ylab("Power") +
    theme(axis.title = element_text(face = "bold")) +
    facet_grid(~ pi0)
Load and plot results from RDS file:
# workingPath <- "/net/irizarryfs01/srv/export/irizarryfs01_backed_up/share_root/shicks/projects/benchmark-fdr"
# df.out.inform <- readRDS(file=file.path(workingPath, "Results-varyingpi0Sims-inform.RDS"))
# NOTE: the simulation saves this file with a ".RDS" extension (see the odyssey
# chunk above); the path previously omitted it, so readRDS would fail.
df.out.inform <- readRDS(file = file.path(RPROJ$PROJHOME, "R/Results-varyingpi0Sims-inform.RDS"))

# Average metrics across MC replications once, then reuse for every plot
# (previously the same group_by/summarize pipeline was recomputed per plot)
summ.inform <- df.out.inform %>%
    dplyr::group_by(method, alpha, pi0) %>%
    dplyr::summarize(n_rejects = mean(n_rejects), FDR = mean(FDP),
                     FWER = mean(FWER), FPR = mean(FPR),
                     Power = mean(power))

# Number of discoveries as a function of $\alpha$
summ.inform %>%
    ggplot(aes(x = alpha, y = n_rejects, color = method)) +
    geom_line() + geom_abline(linetype = "dashed") +
    xlab(expression(bold(paste("Nominal ", alpha)))) + ylab("Number of Discoveries") +
    theme(axis.title = element_text(face = "bold")) +
    facet_grid(~ pi0)

# False discovery rate as a function of $\alpha$ (dashed y = x line = nominal level)
summ.inform %>%
    ggplot(aes(x = alpha, y = FDR, color = method)) +
    geom_line() + geom_abline(linetype = "dashed") +
    xlab(expression(bold(paste("Nominal ", alpha)))) + ylab("FDR") +
    theme(axis.title = element_text(face = "bold")) +
    facet_grid(~ pi0)

# Family-wise error rate as a function of $\alpha$
summ.inform %>%
    ggplot(aes(x = alpha, y = FWER, color = method)) +
    geom_line() + geom_abline(linetype = "dashed") +
    xlab(expression(bold(paste("Nominal ", alpha)))) + ylab("FWER") +
    theme(axis.title = element_text(face = "bold")) +
    facet_grid(~ pi0)

# Power as a function of $\alpha$
summ.inform %>%
    ggplot(aes(x = alpha, y = Power, color = method)) +
    geom_line() +
    xlab(expression(bold(paste("Nominal ", alpha)))) + ylab("Power") +
    theme(axis.title = element_text(face = "bold")) +
    facet_grid(~ pi0)